import requests
import urllib3
from lxml import etree
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# Requests in this script are sent with verify=False; silence the
# InsecureRequestWarning that urllib3 would otherwise emit for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Shared accumulator of listing detail-page URLs; get_detail_url() extends it
# and the script entry point writes it out to urls.txt.
house=[]
def get_detail_url(url, max_attempts=3, timeout=15):
    """Collect the detail-page URLs found on one listing page.

    Fetches ``url``, extracts the ``href`` of every listing anchor inside the
    ``shop_list shop_list_4`` container, prefixes each with the site root and
    appends the results to the module-level ``house`` list.

    If fetching or parsing raises, ``process_captcha()`` is called so an
    operator can clear the captcha, and the page is retried. Retries are
    bounded: after ``max_attempts`` failed attempts the page is skipped.

    Args:
        url: Listing page URL to fetch.
        max_attempts: Maximum number of fetch attempts for this page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. Results are accumulated in the global ``house`` list.
    """
    # NOTE(review): the Cookie below is a hard-coded logged-in session; it
    # will expire and should not be committed to a shared repository.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        "Cookie": "global_cookie=3sxhzc5oindnuxjqz0uj2cdiopyly58xin1; sfut=FF04A7B17F57FA05B47C03DA7B4425EC8244CF29AF40F029908498D0361670E8F4393E9FD1FA0AF31EF410D8EA5E8F1C8E34926D8794047B2BE0673C01C44B316C81BF0DA8F2EFF2F729CB05626F715F3D8FDE2674FA9F3555C7E68AB75D5239; city.sig=OGYSb1kOr8YVFH0wBEXukpoi1DeOqwvdseB7aTrJ-zE; __utmz=147393320.1720069309.10.5.utmcsr=lz.esf.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=147393320.1708965444.1719975381.1720080721.1720097525.12; otherid=7b595a6d98b490f5f503c8c898b5cc80; __utmc=147393320; city=www; csrfToken=upPEX8LsZPzGFCHhq_Vc4hfB; resourceDetail=1; g_sourcepage=esf_fy%5Elb_pc; __utmb=147393320.75.10.1720097525; unique_cookie=U_n3w41yz88apheu7ohofrp5rjb1gly7acpe5*7; new_loginid=129783847; login_username=fang44946405260",
    }

    for attempt in range(1, max_attempts + 1):
        try:
            # verify=False skips TLS certificate validation; the timeout keeps
            # a stalled connection from hanging the whole crawl.
            r = requests.get(url, headers=headers, verify=False, timeout=timeout)
            html = etree.HTML(r.text)
            hrefs = html.xpath('//div[@class="shop_list shop_list_4"]/dl/dt/a/@href')
        except Exception as exc:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still stop the crawl.
            print(f'请求 {url} 失败: {exc!r}')
            if attempt < max_attempts:
                # Presumably the failure is caused by a captcha wall; let the
                # operator solve it in a browser before the next attempt.
                process_captcha()
            continue

        # hrefs are site-relative paths; make them absolute.
        next_urls2 = ['https://lz.esf.fang.com' + h for h in hrefs]
        print(next_urls2)
        house.extend(next_urls2)
        return

    # Every attempt failed: skip this page instead of retrying forever.
    print(f'已尝试 {max_attempts} 次仍失败,跳过: {url}')

def process_captcha(wait_seconds=10):
    """Open a browser on a listing page so a human can solve the captcha.

    Launches Chrome through Selenium, loads a fixed detail page, waits
    ``wait_seconds`` for the operator to type the captcha, then shuts the
    browser down. The browser is always shut down, even if loading the page
    or waiting raises.

    Args:
        wait_seconds: How long to keep the browser open for manual input.
    """
    # This URL only serves to bring up the captcha page; which listing it
    # points at does not matter.
    url = 'https://lz.esf.fang.com/chushou/3_416752691.htm?channel=2,2'
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # The captcha is entered by hand during this pause.
        time.sleep(wait_seconds)
    finally:
        # quit() ends the whole WebDriver session (browser and driver
        # process); close() would only close the current window.
        driver.quit()

if __name__ == '__main__':
    # Script entry point: crawl pages 1-100 of each of the three
    # house-a01..house-a03 listing sections and save every detail URL that
    # was collected to urls.txt, one per line.

    # Start from an empty result list; get_detail_url() appends to this
    # module-level name.
    house = []
    try:
        for j in range(1, 4):
            for i in range(1, 101):
                print('-'*36)
                print(f'开始爬取第{i}页...')
                url = f'https://lz.esf.fang.com/house/house-a0{j}/i3{i}/'
                get_detail_url(url)
        print('爬取结束!!!!!!')
    finally:
        # Persist whatever was gathered even if the crawl was interrupted or
        # crashed part-way, so hours of scraping are not lost.
        print("-"*50)
        print("一共",len(house),"条数据!!!")
        # The context manager guarantees the file is flushed and closed even
        # if a write fails.
        with open('urls.txt', 'w', encoding='utf8') as f:
            f.writelines(link + '\n' for link in house)